The Thera bank recently saw a steep decline in the number of users of their credit card. Credit cards are a good source of income for banks because of the different kinds of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, foreign transaction fees, and others. Some fees are charged to every user irrespective of usage, while others are charged only under specified circumstances.
Customers leaving the credit card service would lead the bank to a loss, so the bank wants to analyze customer data to identify the customers who will leave their credit card services and the reasons for doing so — so that the bank can improve upon those areas.
The purpose of this study is to help Thera bank by coming up with a classification model that will help the bank improve its services so that customers do not renounce their credit cards
# Library to suppress warnings or deprecation notes
import warnings
warnings.filterwarnings("ignore")
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd
# Libraries to help with data visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Libraries to split data, impute missing values
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
# Libraries to import decision tree classifier and different ensemble classifiers
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
# To build a logistic regression model
from sklearn.linear_model import LogisticRegression
# Libtune to tune model, get different metric scores
from sklearn import metrics
from sklearn.metrics import (
confusion_matrix,
classification_report,
accuracy_score,
precision_score,
recall_score,
f1_score,
roc_auc_score,
)
# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
# to create pipeline and make_pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To supress scientific notations for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)
# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# To use statistical functions
import scipy.stats as stats
# This will help in making the Python code more structured automatically (good coding practice)
#%load_ext nb_black
# function to plot a boxplot and a histogram along the same scale.
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Plot a boxplot and a histogram of a numeric column on a shared x-axis.

    data: dataframe
    feature: dataframe column name (numeric)
    figsize: size of figure (default (12, 7))
    kde: whether to show the density curve (default False)
    bins: number of bins for the histogram (default None -> seaborn default)
    """
    # Two stacked subplots: a slim boxplot on top, the histogram below,
    # sharing the x-axis so the two views line up.
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,
        sharex=True,
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot; showmeans=True adds a marker (star) at the column mean.
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )
    # Histogram.  The original selected between two sns.histplot calls with a
    # conditional *expression* evaluated for its side effect — replaced with a
    # plain if/else.  `palette="winter"` is dropped: without `hue` seaborn
    # ignores it and emits a warning.
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins)
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    ax_hist2.axvline(
        data[feature].mean(), color="green", linestyle="--"
    )  # mark the mean on the histogram
    ax_hist2.axvline(
        data[feature].median(), color="black", linestyle="-"
    )  # mark the median on the histogram
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Plot a countplot of a categorical column with a label on top of each bar.

    data: dataframe
    feature: dataframe column name
    perc: whether to display percentages instead of counts (default False)
    n: display only the top n category levels (default None, i.e. all levels)
    """
    total = len(data[feature])  # number of rows (denominator for percentages)
    count = data[feature].nunique()
    # Scale the figure width with the number of bars actually drawn.
    if n is None:
        plt.figure(figsize=(count + 1, 5))
    else:
        plt.figure(figsize=(n + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # keep only the n most frequent levels, then sort them for display
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # idiomatic truth test (was `perc == True`)
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category
        x = p.get_x() + p.get_width() / 2  # horizontal center of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # annotate the count/percentage just above the bar
    plt.show()  # show the plot
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart.

    data: dataframe
    predictor: independent (categorical) variable
    target: target variable
    """
    count = data[predictor].nunique()
    # Sort rows by the least frequent target class (last in value_counts order).
    sorter = data[target].value_counts().index[-1]
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # Row-normalized crosstab: share of each target class within each level.
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 5))
    # Single legend placed outside the axes.  The original called plt.legend
    # twice; the first call ("lower left", frameon=False) was dead code,
    # immediately overridden by this one.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
# Load the raw churn dataset (one row per credit-card customer)
churn = pd.read_csv("BankChurners.csv")
# copying data to another variable to avoid any changes to original data
data = churn.copy()
# view the first 5 rows of the dataset
data.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.000 | 777 | 11914.000 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.000 | 864 | 7392.000 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.000 | 0 | 3418.000 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.000 | 2517 | 796.000 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.000 | 0 | 4716.000 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# view a random sample of the dataset of 10 rows (fixed seed for reproducibility)
data.sample(n=10, random_state=1)
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6498 | 712389108 | Existing Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Blue | 36 | 6 | 3 | 2 | 2570.000 | 2107 | 463.000 | 0.651 | 4058 | 83 | 0.766 | 0.820 |
| 9013 | 718388733 | Existing Customer | 38 | F | 1 | College | NaN | Less than $40K | Blue | 32 | 2 | 3 | 3 | 2609.000 | 1259 | 1350.000 | 0.871 | 8677 | 96 | 0.627 | 0.483 |
| 2053 | 710109633 | Existing Customer | 39 | M | 2 | College | Married | $60K - $80K | Blue | 31 | 6 | 3 | 2 | 9871.000 | 1061 | 8810.000 | 0.545 | 1683 | 34 | 0.478 | 0.107 |
| 3211 | 717331758 | Existing Customer | 44 | M | 4 | Graduate | Married | $120K + | Blue | 32 | 6 | 3 | 4 | 34516.000 | 2517 | 31999.000 | 0.765 | 4228 | 83 | 0.596 | 0.073 |
| 5559 | 709460883 | Attrited Customer | 38 | F | 2 | Doctorate | Married | Less than $40K | Blue | 28 | 5 | 2 | 4 | 1614.000 | 0 | 1614.000 | 0.609 | 2437 | 46 | 0.438 | 0.000 |
| 6106 | 789105183 | Existing Customer | 54 | M | 3 | Post-Graduate | Single | $80K - $120K | Silver | 42 | 3 | 1 | 2 | 34516.000 | 2488 | 32028.000 | 0.552 | 4401 | 87 | 0.776 | 0.072 |
| 4150 | 771342183 | Attrited Customer | 53 | F | 3 | Graduate | Single | $40K - $60K | Blue | 40 | 6 | 3 | 2 | 1625.000 | 0 | 1625.000 | 0.689 | 2314 | 43 | 0.433 | 0.000 |
| 2205 | 708174708 | Existing Customer | 38 | M | 4 | Graduate | Married | $40K - $60K | Blue | 27 | 6 | 2 | 4 | 5535.000 | 1276 | 4259.000 | 0.636 | 1764 | 38 | 0.900 | 0.231 |
| 4145 | 718076733 | Existing Customer | 43 | M | 1 | Graduate | Single | $60K - $80K | Silver | 31 | 4 | 3 | 3 | 25824.000 | 1170 | 24654.000 | 0.684 | 3101 | 73 | 0.780 | 0.045 |
| 5324 | 821889858 | Attrited Customer | 50 | F | 1 | Doctorate | Single | abc | Blue | 46 | 6 | 4 | 3 | 1970.000 | 1477 | 493.000 | 0.662 | 2493 | 44 | 0.571 | 0.750 |
The CLIENTNUM column looks like a client identifier and can be dropped. The income category has various ranges and is a mix of numbers and characters. Also one of the values is abc, this can be looked into more closely.
# checking the shape of the data (rows, columns)
print(f"There are {data.shape[0]} rows and {data.shape[1]} columns.")  # f-string
There are 10127 rows and 21 columns.
# Check for duplicate rows.  Compute the count once instead of twice
# (the original evaluated data.duplicated().sum() and discarded the result,
# then recomputed it inside the print).
n_duplicates = data.duplicated().sum()
print(f"There are {n_duplicates} duplicate rows.")  # f-string
There are 0 duplicate rows.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
Education_level and Marital_Status have nulls. Most of the columns are numeric except for Attrition_Flag, Gender, Education_level, Marital_Status, Income_Category, and Card_Category, these columns are of object datatype. We can look into value counts to see if some of the columns can be made categorical. Attrition_Flag is our target column and it is defined as object type have to look into this further.
# Print the number of unique values in each column
data.nunique()
CLIENTNUM 10127 Attrition_Flag 2 Customer_Age 45 Gender 2 Dependent_count 6 Education_Level 6 Marital_Status 3 Income_Category 6 Card_Category 4 Months_on_book 44 Total_Relationship_Count 6 Months_Inactive_12_mon 7 Contacts_Count_12_mon 7 Credit_Limit 6205 Total_Revolving_Bal 1974 Avg_Open_To_Buy 6813 Total_Amt_Chng_Q4_Q1 1158 Total_Trans_Amt 5033 Total_Trans_Ct 126 Total_Ct_Chng_Q4_Q1 830 Avg_Utilization_Ratio 964 dtype: int64
# Dropping CLIENTNUM column — it is only a client identifier, not a feature
data.drop(columns="CLIENTNUM", inplace=True)
# Encode the target: Existing Customer -> 1, Attrited Customer -> 0.
# A single dict-based replace assigned back to the column avoids the
# deprecated `Series.replace(..., inplace=True)` pattern (FutureWarning in
# pandas 2.x) and does both substitutions in one pass.
data["Attrition_Flag"] = data["Attrition_Flag"].replace(
    {"Existing Customer": 1, "Attrited Customer": 0}
)
# Plot a histogram for every column with at most 6 distinct values
# (NaN counted as its own value, matching len(pd.unique(...)) semantics) --
# such columns are good candidates for conversion to categorical variables.
for col in data.columns:
    if data[col].nunique(dropna=False) <= 6:
        plt.figure(figsize=(7, 4))
        sns.histplot(data=data, x=col)
        plt.show()
# Looking at rows where Income_Category holds the suspicious 'abc' value
data[(data.Income_Category =='abc')]
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 19 | 1 | 45 | F | 2 | Graduate | Married | abc | Blue | 37 | 6 | 1 | 2 | 14470.000 | 1157 | 13313.000 | 0.966 | 1207 | 21 | 0.909 | 0.080 |
| 28 | 1 | 44 | F | 3 | Uneducated | Single | abc | Blue | 34 | 5 | 2 | 2 | 10100.000 | 0 | 10100.000 | 0.525 | 1052 | 18 | 1.571 | 0.000 |
| 39 | 0 | 66 | F | 0 | Doctorate | Married | abc | Blue | 56 | 5 | 4 | 3 | 7882.000 | 605 | 7277.000 | 1.052 | 704 | 16 | 0.143 | 0.077 |
| 44 | 1 | 38 | F | 4 | Graduate | Single | abc | Blue | 28 | 2 | 3 | 3 | 9830.000 | 2055 | 7775.000 | 0.977 | 1042 | 23 | 0.917 | 0.209 |
| 58 | 1 | 44 | F | 5 | Graduate | Married | abc | Blue | 35 | 4 | 1 | 2 | 6273.000 | 978 | 5295.000 | 2.275 | 1359 | 25 | 1.083 | 0.156 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10021 | 0 | 30 | F | 1 | Graduate | Married | abc | Blue | 18 | 4 | 1 | 4 | 4377.000 | 2517 | 1860.000 | 0.941 | 8759 | 74 | 0.609 | 0.575 |
| 10040 | 0 | 50 | F | 3 | Doctorate | Single | abc | Blue | 36 | 4 | 3 | 3 | 5173.000 | 0 | 5173.000 | 0.912 | 8757 | 68 | 0.789 | 0.000 |
| 10083 | 1 | 42 | F | 4 | Uneducated | Married | abc | Blue | 23 | 4 | 1 | 2 | 8348.000 | 0 | 8348.000 | 0.695 | 15905 | 111 | 0.708 | 0.000 |
| 10092 | 0 | 40 | F | 3 | Graduate | Married | abc | Blue | 25 | 1 | 2 | 3 | 6888.000 | 1878 | 5010.000 | 1.059 | 9038 | 64 | 0.829 | 0.273 |
| 10119 | 0 | 55 | F | 3 | Uneducated | Single | abc | Blue | 47 | 4 | 3 | 3 | 14657.000 | 2517 | 12140.000 | 0.166 | 6009 | 53 | 0.514 | 0.172 |
1112 rows × 20 columns
There are 1,112 rows where the Income_Category is abc. This looks like a data entry error and will be updated with 'unknown'
# Replace the erroneous 'abc' value with 'unknown' in Income_Category.
# Assigning the result back avoids the deprecated
# `Series.replace(..., inplace=True)` pattern (FutureWarning in pandas 2.x).
data["Income_Category"] = data["Income_Category"].replace("abc", "unknown")
data["Income_Category"].value_counts()
Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 unknown 1112 $120K + 727 Name: Income_Category, dtype: int64
# Columns to be treated as categorical variables
cat_col = [
    "Gender",
    "Dependent_count",
    "Education_Level",
    "Marital_Status",
    "Income_Category",
    "Card_Category",
    "Total_Relationship_Count",
]
# Cast all of them to pandas' 'category' dtype in one vectorized call
data[cat_col] = data[cat_col].astype("category")
# The target was label-encoded earlier; store it explicitly as int64
data["Attrition_Flag"] = data["Attrition_Flag"].astype("int64")
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null int64 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null category 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 10127 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null category 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: category(7), float64(5), int64(8) memory usage: 1.1 MB
data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 10127.000 | 0.839 | 0.367 | 0.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| Customer_Age | 10127.000 | 46.326 | 8.017 | 26.000 | 41.000 | 46.000 | 52.000 | 73.000 |
| Months_on_book | 10127.000 | 35.928 | 7.986 | 13.000 | 31.000 | 36.000 | 40.000 | 56.000 |
| Months_Inactive_12_mon | 10127.000 | 2.341 | 1.011 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Contacts_Count_12_mon | 10127.000 | 2.455 | 1.106 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Credit_Limit | 10127.000 | 8631.954 | 9088.777 | 1438.300 | 2555.000 | 4549.000 | 11067.500 | 34516.000 |
| Total_Revolving_Bal | 10127.000 | 1162.814 | 814.987 | 0.000 | 359.000 | 1276.000 | 1784.000 | 2517.000 |
| Avg_Open_To_Buy | 10127.000 | 7469.140 | 9090.685 | 3.000 | 1324.500 | 3474.000 | 9859.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 10127.000 | 0.760 | 0.219 | 0.000 | 0.631 | 0.736 | 0.859 | 3.397 |
| Total_Trans_Amt | 10127.000 | 4404.086 | 3397.129 | 510.000 | 2155.500 | 3899.000 | 4741.000 | 18484.000 |
| Total_Trans_Ct | 10127.000 | 64.859 | 23.473 | 10.000 | 45.000 | 67.000 | 81.000 | 139.000 |
| Total_Ct_Chng_Q4_Q1 | 10127.000 | 0.712 | 0.238 | 0.000 | 0.582 | 0.702 | 0.818 | 3.714 |
| Avg_Utilization_Ratio | 10127.000 | 0.275 | 0.276 | 0.000 | 0.023 | 0.176 | 0.503 | 0.999 |
# For every numeric (int64/float64) column, show a histogram and box plot.
# select_dtypes replaces the original `Index | Index` union, which is a
# deprecated set operation on pandas Index objects and also re-sorted the
# column names.
for colname in data.select_dtypes(include=["int64", "float64"]).columns:
    histogram_boxplot(data, colname)
# Observation on Avg_Open_To_Buy. We saw above that this has many upper outliers; let's look at them
data[(data.Avg_Open_To_Buy > 25000)]
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | 1 | 51 | M | 4 | NaN | Married | $120K + | Gold | 46 | 6 | 1 | 3 | 34516.000 | 2264 | 32252.000 | 1.975 | 1330 | 31 | 0.722 | 0.066 |
| 7 | 1 | 32 | M | 0 | High School | NaN | $60K - $80K | Silver | 27 | 2 | 2 | 2 | 29081.000 | 1396 | 27685.000 | 2.204 | 1538 | 36 | 0.714 | 0.048 |
| 16 | 1 | 48 | M | 4 | Post-Graduate | Single | $80K - $120K | Blue | 36 | 6 | 2 | 3 | 30367.000 | 2362 | 28005.000 | 1.708 | 1671 | 27 | 0.929 | 0.078 |
| 40 | 1 | 45 | M | 3 | Graduate | Single | $80K - $120K | Blue | 41 | 2 | 2 | 2 | 32426.000 | 578 | 31848.000 | 1.042 | 1109 | 28 | 0.474 | 0.018 |
| 45 | 1 | 49 | M | 4 | Uneducated | Single | $80K - $120K | Blue | 30 | 3 | 2 | 3 | 34516.000 | 0 | 34516.000 | 1.621 | 1444 | 28 | 1.333 | 0.000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10098 | 0 | 55 | M | 3 | Graduate | Single | $120K + | Silver | 36 | 4 | 3 | 4 | 34516.000 | 0 | 34516.000 | 1.007 | 9931 | 70 | 0.750 | 0.000 |
| 10100 | 1 | 39 | M | 2 | Graduate | NaN | $60K - $80K | Silver | 36 | 4 | 2 | 2 | 29808.000 | 0 | 29808.000 | 0.669 | 16098 | 128 | 0.684 | 0.000 |
| 10104 | 1 | 51 | M | 3 | Graduate | Single | $60K - $80K | Silver | 36 | 3 | 2 | 2 | 29663.000 | 1743 | 27920.000 | 0.667 | 14638 | 93 | 0.722 | 0.059 |
| 10110 | 1 | 56 | M | 1 | Graduate | Single | $80K - $120K | Silver | 49 | 5 | 2 | 2 | 34516.000 | 1091 | 33425.000 | 0.640 | 15274 | 108 | 0.714 | 0.032 |
| 10112 | 0 | 33 | M | 2 | College | Married | $120K + | Gold | 20 | 2 | 1 | 4 | 34516.000 | 0 | 34516.000 | 1.004 | 9338 | 73 | 0.622 | 0.000 |
835 rows × 20 columns
There are 835 rows where the Avg_Open_To_Buy is more than 25,000, but this looks normal as the credit limit is also more than 25,000. Both Credit_Limit and Avg_Open_To_Buy have outliers, as there is a wide range of credit limits that customers can have. The max credit limit is 34,516 and the max Avg_Open_To_Buy is also 34,516. Since there are only 835 rows we will treat the outliers by capping.
# Observation on Contacts_Count_12_mon. We saw above that this has outliers on both ends; let's look at them
data.Contacts_Count_12_mon.value_counts(normalize=True)
3 0.334 2 0.319 1 0.148 4 0.137 0 0.039 5 0.017 6 0.005 Name: Contacts_Count_12_mon, dtype: float64
It looks like 64% of the customers were contacted 2 or 3 times. Around 14% of the customers were contacted 1 or 4 times. Only 4% of the customers were contacted 0 times, less than 2% were contacted 5 times, and less than half a percent were contacted 6 times. The outliers are very minimal, so we will go ahead and treat both the lower and upper outliers.
# Observation on Credit_Limit. We saw above that this has many upper outliers; let's look at them
data[(data.Credit_Limit > 25000)]
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6 | 1 | 51 | M | 4 | NaN | Married | $120K + | Gold | 46 | 6 | 1 | 3 | 34516.000 | 2264 | 32252.000 | 1.975 | 1330 | 31 | 0.722 | 0.066 |
| 7 | 1 | 32 | M | 0 | High School | NaN | $60K - $80K | Silver | 27 | 2 | 2 | 2 | 29081.000 | 1396 | 27685.000 | 2.204 | 1538 | 36 | 0.714 | 0.048 |
| 16 | 1 | 48 | M | 4 | Post-Graduate | Single | $80K - $120K | Blue | 36 | 6 | 2 | 3 | 30367.000 | 2362 | 28005.000 | 1.708 | 1671 | 27 | 0.929 | 0.078 |
| 40 | 1 | 45 | M | 3 | Graduate | Single | $80K - $120K | Blue | 41 | 2 | 2 | 2 | 32426.000 | 578 | 31848.000 | 1.042 | 1109 | 28 | 0.474 | 0.018 |
| 45 | 1 | 49 | M | 4 | Uneducated | Single | $80K - $120K | Blue | 30 | 3 | 2 | 3 | 34516.000 | 0 | 34516.000 | 1.621 | 1444 | 28 | 1.333 | 0.000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10098 | 0 | 55 | M | 3 | Graduate | Single | $120K + | Silver | 36 | 4 | 3 | 4 | 34516.000 | 0 | 34516.000 | 1.007 | 9931 | 70 | 0.750 | 0.000 |
| 10100 | 1 | 39 | M | 2 | Graduate | NaN | $60K - $80K | Silver | 36 | 4 | 2 | 2 | 29808.000 | 0 | 29808.000 | 0.669 | 16098 | 128 | 0.684 | 0.000 |
| 10104 | 1 | 51 | M | 3 | Graduate | Single | $60K - $80K | Silver | 36 | 3 | 2 | 2 | 29663.000 | 1743 | 27920.000 | 0.667 | 14638 | 93 | 0.722 | 0.059 |
| 10110 | 1 | 56 | M | 1 | Graduate | Single | $80K - $120K | Silver | 49 | 5 | 2 | 2 | 34516.000 | 1091 | 33425.000 | 0.640 | 15274 | 108 | 0.714 | 0.032 |
| 10112 | 0 | 33 | M | 2 | College | Married | $120K + | Gold | 20 | 2 | 1 | 4 | 34516.000 | 0 | 34516.000 | 1.004 | 9338 | 73 | 0.622 | 0.000 |
892 rows × 20 columns
There are 892 rows where the Credit_Limit is more than 25000. The maximum credit_Limit is 34516. This is understandable as different customers will have different credit limits. Since there are only 892 rows we can treat the outliers.
# For every categorical/object column, show a labeled distribution plot.
# select_dtypes replaces the original `Index | Index` union, which is a
# deprecated set operation on pandas Index objects.
for colname in data.select_dtypes(include=["category", "object"]).columns:
    labeled_barplot(data, colname, perc=True)
# Pairwise scatter/density plots of the numeric columns, colored by attrition status
sns.pairplot(data=data, hue="Attrition_Flag")
<seaborn.axisgrid.PairGrid at 0x1665a5a9d90>
# Correlation heatmap of the numeric columns.
# numeric_only=True keeps pandas >= 2.0 from raising on the category columns
# (older pandas silently dropped them, so the result is unchanged).
correlation = data.corr(numeric_only=True)
plt.figure(figsize=(15, 7))
sns.heatmap(correlation, annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
# Dropping highly correlated variables, Months_on_book and Total_Trans_Amt,
# in a single call instead of two separate drops.
data.drop(columns=["Months_on_book", "Total_Trans_Amt"], inplace=True)
# For each categorical column, print the crosstab against the target and
# draw a stacked bar chart to see how the category relates to attrition.
for colname in data.select_dtypes(include="category").columns:
    stacked_barplot(data, colname, "Attrition_Flag")
Attrition_Flag 0 1 All Gender All 1627 8500 10127 F 930 4428 5358 M 697 4072 4769 ------------------------------------------------------------------------------------------------------------------------
Attrition_Flag 0 1 All Dependent_count All 1627 8500 10127 3 482 2250 2732 2 417 2238 2655 1 269 1569 1838 4 260 1314 1574 0 135 769 904 5 64 360 424 ------------------------------------------------------------------------------------------------------------------------
Attrition_Flag 0 1 All Education_Level All 1371 7237 8608 Graduate 487 2641 3128 High School 306 1707 2013 Uneducated 237 1250 1487 College 154 859 1013 Doctorate 95 356 451 Post-Graduate 92 424 516 ------------------------------------------------------------------------------------------------------------------------
Attrition_Flag 0 1 All Marital_Status All 1498 7880 9378 Married 709 3978 4687 Single 668 3275 3943 Divorced 121 627 748 ------------------------------------------------------------------------------------------------------------------------
Attrition_Flag 0 1 All Income_Category All 1627 8500 10127 Less than $40K 612 2949 3561 $40K - $60K 271 1519 1790 $80K - $120K 242 1293 1535 $60K - $80K 189 1213 1402 unknown 187 925 1112 $120K + 126 601 727 ------------------------------------------------------------------------------------------------------------------------
Attrition_Flag 0 1 All Card_Category All 1627 8500 10127 Blue 1519 7917 9436 Silver 82 473 555 Gold 21 95 116 Platinum 5 15 20 ------------------------------------------------------------------------------------------------------------------------
Attrition_Flag 0 1 All Total_Relationship_Count All 1627 8500 10127 3 400 1905 2305 2 346 897 1243 1 233 677 910 5 227 1664 1891 4 225 1687 1912 6 196 1670 1866 ------------------------------------------------------------------------------------------------------------------------
# Mean Attrition_Flag (1 = existing) by customer age; the y-axis is inverted
# so that higher points on the plot correspond to more attrition.
ax = sns.lineplot(x="Customer_Age", y="Attrition_Flag", data=data)
ax.invert_yaxis()
There is a small spike of attrition where the customer age is 30, and then there are two more strong spikes between the ages of 65 and 70.
# Distribution of transaction counts for attrited (0) vs existing (1) customers
plt.figure(figsize=(15, 5))
sns.boxplot(y="Total_Trans_Ct", x="Attrition_Flag", data=data)
plt.show()
Customers who attrited have a very low total_trans_ct as compared to existing customers.
# Distribution of credit limits for attrited (0) vs existing (1) customers
plt.figure(figsize=(15, 5))
sns.boxplot(y="Credit_Limit", x="Attrition_Flag", data=data)
plt.show()
Customers who attrited had a lesser credit limit as compared to existing customers. The data is skewed for both classes.
# Mean months of inactivity (last 12 months) for attrited (0) vs existing (1) customers
plt.figure(figsize=(15, 5))
sns.barplot(y="Months_Inactive_12_mon", x="Attrition_Flag", data=data)
plt.show()
Customers who attrited had accounts that were inactive for more months in the last 12 months than existing customers.
# Profile of existing (Attrition_Flag == 1) Platinum-card customers
data[(data["Card_Category"] == "Platinum") & (data["Attrition_Flag"] == 1)].describe(
    include="all"
).T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 15.000 | NaN | NaN | NaN | 1.000 | 0.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| Customer_Age | 15.000 | NaN | NaN | NaN | 46.867 | 4.941 | 39.000 | 43.500 | 45.000 | 51.000 | 56.000 |
| Gender | 15 | 2 | M | 10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Dependent_count | 15.000 | 5.000 | 3.000 | 7.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Education_Level | 12 | 5 | Graduate | 5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Marital_Status | 13 | 3 | Single | 7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Income_Category | 15 | 5 | $120K + | 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Card_Category | 15 | 1 | Platinum | 15 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Total_Relationship_Count | 15.000 | 4.000 | 1.000 | 5.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_Inactive_12_mon | 15.000 | NaN | NaN | NaN | 2.133 | 0.915 | 1.000 | 1.500 | 2.000 | 3.000 | 4.000 |
| Contacts_Count_12_mon | 15.000 | NaN | NaN | NaN | 2.267 | 0.961 | 1.000 | 1.500 | 2.000 | 3.000 | 4.000 |
| Credit_Limit | 15.000 | NaN | NaN | NaN | 32045.467 | 6519.721 | 15987.000 | 34516.000 | 34516.000 | 34516.000 | 34516.000 |
| Total_Revolving_Bal | 15.000 | NaN | NaN | NaN | 1602.800 | 519.124 | 0.000 | 1491.000 | 1578.000 | 1919.000 | 2262.000 |
| Avg_Open_To_Buy | 15.000 | NaN | NaN | NaN | 30442.667 | 6650.801 | 13725.000 | 32555.500 | 32793.000 | 33006.500 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 15.000 | NaN | NaN | NaN | 0.779 | 0.095 | 0.628 | 0.728 | 0.746 | 0.857 | 0.988 |
| Total_Trans_Ct | 15.000 | NaN | NaN | NaN | 95.867 | 27.537 | 29.000 | 92.500 | 102.000 | 114.000 | 127.000 |
| Total_Ct_Chng_Q4_Q1 | 15.000 | NaN | NaN | NaN | 0.706 | 0.149 | 0.481 | 0.607 | 0.708 | 0.781 | 1.071 |
| Avg_Utilization_Ratio | 15.000 | NaN | NaN | NaN | 0.055 | 0.030 | 0.000 | 0.043 | 0.050 | 0.057 | 0.141 |
Customer Profile for customers who opt for Platinum Card is as follows:
# Profile of existing (Attrition_Flag == 1) Gold-card customers
data[(data["Card_Category"] == "Gold") & (data["Attrition_Flag"] == 1)].describe(
    include="all"
).T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 95.000 | NaN | NaN | NaN | 1.000 | 0.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| Customer_Age | 95.000 | NaN | NaN | NaN | 45.789 | 6.633 | 29.000 | 41.000 | 46.000 | 50.000 | 63.000 |
| Gender | 95 | 2 | M | 65 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Dependent_count | 95.000 | 6.000 | 3.000 | 26.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Education_Level | 81 | 6 | Graduate | 30 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Marital_Status | 87 | 3 | Single | 47 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Income_Category | 95 | 6 | $60K - $80K | 23 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Card_Category | 95 | 1 | Gold | 95 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Total_Relationship_Count | 95.000 | 6.000 | 2.000 | 29.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_Inactive_12_mon | 95.000 | NaN | NaN | NaN | 2.242 | 1.146 | 1.000 | 1.000 | 2.000 | 3.000 | 6.000 |
| Contacts_Count_12_mon | 95.000 | NaN | NaN | NaN | 2.326 | 0.994 | 0.000 | 2.000 | 2.000 | 3.000 | 5.000 |
| Credit_Limit | 95.000 | NaN | NaN | NaN | 28093.158 | 8854.919 | 3735.000 | 19354.500 | 34516.000 | 34516.000 | 34516.000 |
| Total_Revolving_Bal | 95.000 | NaN | NaN | NaN | 1414.463 | 700.218 | 0.000 | 1048.000 | 1540.000 | 1956.500 | 2517.000 |
| Avg_Open_To_Buy | 95.000 | NaN | NaN | NaN | 26678.695 | 8823.170 | 2261.000 | 18459.500 | 32455.000 | 33097.500 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 95.000 | NaN | NaN | NaN | 0.773 | 0.186 | 0.436 | 0.700 | 0.739 | 0.811 | 1.975 |
| Total_Trans_Ct | 95.000 | NaN | NaN | NaN | 86.305 | 26.475 | 22.000 | 71.000 | 88.000 | 103.500 | 131.000 |
| Total_Ct_Chng_Q4_Q1 | 95.000 | NaN | NaN | NaN | 0.725 | 0.170 | 0.308 | 0.617 | 0.724 | 0.798 | 1.250 |
| Avg_Utilization_Ratio | 95.000 | NaN | NaN | NaN | 0.062 | 0.063 | 0.000 | 0.033 | 0.053 | 0.071 | 0.395 |
Customer Profile for customers who opt for Gold Card is as follows:
# Summary statistics for Blue-card customers with Attrition_Flag == 1
data[(data["Card_Category"] == "Blue") & (data["Attrition_Flag"] == 1)].describe(
    include="all"
).T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 7917.000 | NaN | NaN | NaN | 1.000 | 0.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| Customer_Age | 7917.000 | NaN | NaN | NaN | 46.317 | 8.139 | 26.000 | 41.000 | 46.000 | 52.000 | 73.000 |
| Gender | 7917 | 2 | F | 4211 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Dependent_count | 7917.000 | 6.000 | 2.000 | 2098.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Education_Level | 6740 | 6 | Graduate | 2449 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Marital_Status | 7354 | 3 | Married | 3759 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Income_Category | 7917 | 6 | Less than $40K | 2817 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Card_Category | 7917 | 1 | Blue | 7917 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Total_Relationship_Count | 7917.000 | 6.000 | 3.000 | 1790.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_Inactive_12_mon | 7917.000 | NaN | NaN | NaN | 2.278 | 1.021 | 0.000 | 1.000 | 2.000 | 3.000 | 6.000 |
| Contacts_Count_12_mon | 7917.000 | NaN | NaN | NaN | 2.357 | 1.087 | 0.000 | 2.000 | 2.000 | 3.000 | 5.000 |
| Credit_Limit | 7917.000 | NaN | NaN | NaN | 7468.545 | 7673.216 | 1438.300 | 2535.000 | 4163.000 | 9227.000 | 34516.000 |
| Total_Revolving_Bal | 7917.000 | NaN | NaN | NaN | 1251.592 | 759.004 | 0.000 | 794.000 | 1353.000 | 1801.000 | 2517.000 |
| Avg_Open_To_Buy | 7917.000 | NaN | NaN | NaN | 6216.953 | 7691.475 | 15.000 | 1102.000 | 2978.000 | 7904.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 7917.000 | NaN | NaN | NaN | 0.773 | 0.220 | 0.256 | 0.640 | 0.742 | 0.861 | 3.397 |
| Total_Trans_Ct | 7917.000 | NaN | NaN | NaN | 67.827 | 22.366 | 11.000 | 53.000 | 70.000 | 82.000 | 139.000 |
| Total_Ct_Chng_Q4_Q1 | 7917.000 | NaN | NaN | NaN | 0.744 | 0.230 | 0.028 | 0.617 | 0.722 | 0.837 | 3.714 |
| Avg_Utilization_Ratio | 7917.000 | NaN | NaN | NaN | 0.314 | 0.274 | 0.000 | 0.066 | 0.244 | 0.552 | 0.994 |
Customer Profile for customers who opt for Blue Card is as follows:
# Summary statistics for Silver-card customers with Attrition_Flag == 1
data[(data["Card_Category"] == "Silver") & (data["Attrition_Flag"] == 1)].describe(
    include="all"
).T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Attrition_Flag | 473.000 | NaN | NaN | NaN | 1.000 | 0.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| Customer_Age | 473.000 | NaN | NaN | NaN | 45.423 | 7.392 | 26.000 | 41.000 | 45.000 | 50.000 | 65.000 |
| Gender | 473 | 2 | M | 291 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Dependent_count | 473.000 | 6.000 | 3.000 | 127.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Education_Level | 404 | 6 | Graduate | 157 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Marital_Status | 426 | 3 | Single | 208 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Income_Category | 473 | 6 | Less than $40K | 110 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Card_Category | 473 | 1 | Silver | 473 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Total_Relationship_Count | 473.000 | 6.000 | 3.000 | 95.000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Months_Inactive_12_mon | 473.000 | NaN | NaN | NaN | 2.209 | 0.909 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Contacts_Count_12_mon | 473.000 | NaN | NaN | NaN | 2.349 | 1.008 | 0.000 | 2.000 | 2.000 | 3.000 | 5.000 |
| Credit_Limit | 473.000 | NaN | NaN | NaN | 25159.529 | 9587.486 | 6224.000 | 15198.000 | 29081.000 | 34516.000 | 34516.000 |
| Total_Revolving_Bal | 473.000 | NaN | NaN | NaN | 1297.818 | 748.726 | 0.000 | 902.000 | 1433.000 | 1847.000 | 2517.000 |
| Avg_Open_To_Buy | 473.000 | NaN | NaN | NaN | 23861.710 | 9563.531 | 5447.000 | 13994.000 | 27920.000 | 32830.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 473.000 | NaN | NaN | NaN | 0.772 | 0.196 | 0.293 | 0.662 | 0.752 | 0.858 | 2.368 |
| Total_Trans_Ct | 473.000 | NaN | NaN | NaN | 78.414 | 26.718 | 13.000 | 62.000 | 77.000 | 100.000 | 134.000 |
| Total_Ct_Chng_Q4_Q1 | 473.000 | NaN | NaN | NaN | 0.726 | 0.212 | 0.182 | 0.620 | 0.714 | 0.795 | 2.429 |
| Avg_Utilization_Ratio | 473.000 | NaN | NaN | NaN | 0.062 | 0.050 | 0.000 | 0.031 | 0.053 | 0.084 | 0.234 |
Customer Profile for customers who opt for Silver Card is as follows:
These profiles can act as a preliminary step to categorize customers for different packages and based on these profiles:
# Outlier detection using box plots
# Columns with continuous numeric data that may contain outliers.
# FIX: keep a plain list of column names instead of data[[...]] — the original
# built a DataFrame copy just to carry the names, which wastes memory and goes
# stale once outliers are treated in `data`. Every later use only iterates the
# column names, so behavior is unchanged.
numerical_col = [
    "Avg_Open_To_Buy",
    "Contacts_Count_12_mon",
    "Credit_Limit",
    "Customer_Age",
    "Months_Inactive_12_mon",
    "Total_Amt_Chng_Q4_Q1",
    "Total_Ct_Chng_Q4_Q1",
    "Total_Trans_Ct",
]
# Histogram + box plot for each column (helper defined earlier in the notebook)
for colname in numerical_col:
    histogram_boxplot(data, colname)
# functions to treat outliers by flooring and capping
# function to treat outliers by flooring and capping (winsorizing at the whiskers)
def treat_outliers(df, col):
    """
    Cap/floor the outliers of one column in place using the IQR rule.

    df: dataframe
    col: dataframe column name

    Values below Q1 - 1.5*IQR are raised to that bound; values above
    Q3 + 1.5*IQR are lowered to that bound. Returns the same dataframe.
    """
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    spread = q3 - q1  # inter-quartile range
    low_cap = q1 - 1.5 * spread
    high_cap = q3 + 1.5 * spread
    # pandas clip bounds the column to [low_cap, high_cap]
    df[col] = df[col].clip(lower=low_cap, upper=high_cap)
    return df
def treat_outliers_all(df, col_list):
    """
    Apply treat_outliers to every column named in col_list.

    df: dataframe
    col_list: iterable of column names
    Returns the dataframe with each listed column floored/capped.
    """
    for column_name in col_list:
        df = treat_outliers(df, column_name)
    return df
# Floor/cap the outliers in all the selected numeric columns
data = treat_outliers_all(data, numerical_col)
# let's look at box plot to see if outliers have been treated or not
for colname in numerical_col:
    histogram_boxplot(data, colname)
The outliers have been treated
The costliest error is predicting that a customer will not attrite when the customer actually does attrite (a false negative).
We would therefore want Recall to be maximized; the greater the Recall, the higher the chance of minimizing false negatives.
# To prevent data leakage the data will be split first before imputing nulls
# Separate features from the target; imputation happens after splitting
# to prevent data leakage
X = data.drop(["Attrition_Flag"], axis=1)
y = data["Attrition_Flag"]
# Splitting data into training, validation and test sets:
# first we split data into 2 parts, say temporary and test (80/20, stratified)
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# then we split the temporary set into train and validation
# (0.25 of 80% = 20% of the whole, giving a 60/20/20 split overall)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 17) (2026, 17) (2026, 17)
# Preview the first five training rows to sanity-check the split
X_train.head()
| Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9501 | 47.000 | M | 2 | NaN | Divorced | $80K - $120K | Blue | 1 | 2.000 | 2.000 | 21714.000 | 1969 | 19745.000 | 0.944 | 104.000 | 0.625 | 0.091 |
| 5065 | 49.000 | F | 4 | Doctorate | Single | Less than $40K | Blue | 5 | 1.000 | 4.000 | 7789.000 | 957 | 6832.000 | 0.724 | 70.000 | 0.842 | 0.123 |
| 2375 | 53.000 | F | 2 | Graduate | Married | Less than $40K | Blue | 6 | 1.000 | 3.000 | 3176.000 | 1470 | 1706.000 | 0.388 | 53.000 | 0.472 | 0.463 |
| 7579 | 56.000 | M | 2 | Graduate | Divorced | $120K + | Blue | 3 | 3.000 | 1.000 | 3296.000 | 1435 | 1861.000 | 0.968 | 66.000 | 0.737 | 0.435 |
| 2776 | 47.000 | M | 4 | Uneducated | Married | $60K - $80K | Blue | 3 | 3.000 | 3.000 | 17557.000 | 0 | 17557.000 | 0.667 | 62.000 | 0.378 | 0.000 |
# Confirm the 60/20/20 split sizes
print("Shape of Training set : ", X_train.shape)
print("Shape of validation set : ", X_val.shape)
print("Shape of test set : ", X_test.shape)
# Stratified splitting should preserve the class proportions in every set
print("Percentage of classes in training set:")
print(y_train.value_counts(normalize=True))
print("Percentage of classes in validation set:")
print(y_val.value_counts(normalize=True))
print("Percentage of classes in test set:")
print(y_test.value_counts(normalize=True))
Shape of Training set : (6075, 17) Shape of validation set : (2026, 17) Shape of test set : (2026, 17) Percentage of classes in training set: 1 0.839 0 0.161 Name: Attrition_Flag, dtype: float64 Percentage of classes in validation set: 1 0.839 0 0.161 Name: Attrition_Flag, dtype: float64 Percentage of classes in test set: 1 0.840 0 0.160 Name: Attrition_Flag, dtype: float64
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision and F1 for a fitted sklearn classifier.

    model: classifier
    predictors: independent variables
    target: dependent variable

    Returns a one-row pandas DataFrame with columns
    Accuracy / Recall / Precision / F1.
    """
    # predictions from the fitted model on the given predictors
    pred = model.predict(predictors)
    # compute each metric against the true target
    metric_values = {
        "Accuracy": accuracy_score(target, pred),
        "Recall": recall_score(target, pred),
        "Precision": precision_score(target, pred),
        "F1": f1_score(target, pred),
    }
    # one-row dataframe of metrics
    return pd.DataFrame(metric_values, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix as a heatmap annotated with counts and
    percentages of all samples.

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    predicted = model.predict(predictors)
    cm = confusion_matrix(target, predicted)
    total = cm.flatten().sum()
    # annotate each cell with "count\npercent-of-all-samples"
    cell_text = [
        "{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)
        for item in cm.flatten()
    ]
    labels = np.asarray(cell_text).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
To prevent data leakage, we will treat the missing values after splitting the data set.
# Looking at the columns that have nulls, most-null first
data.isnull().sum().sort_values(ascending=False)
Education_Level 1519 Marital_Status 749 Attrition_Flag 0 Contacts_Count_12_mon 0 Total_Ct_Chng_Q4_Q1 0 Total_Trans_Ct 0 Total_Amt_Chng_Q4_Q1 0 Avg_Open_To_Buy 0 Total_Revolving_Bal 0 Credit_Limit 0 Months_Inactive_12_mon 0 Customer_Age 0 Total_Relationship_Count 0 Card_Category 0 Income_Category 0 Dependent_count 0 Gender 0 Avg_Utilization_Ratio 0 dtype: int64
Education_Level and Marital_Status have nulls.
# Let's impute the missing values using SimpleImputer. We will use the mode since they are categorical columns.
imp_mode = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
cols_to_impute = ["Education_Level", "Marital_Status"]
# fit and transform the imputer on train data
X_train[cols_to_impute] = imp_mode.fit_transform(X_train[cols_to_impute])
# Transform on validation and test data
X_val[cols_to_impute] = imp_mode.transform(X_val[cols_to_impute])
# transform only (NOT fit) on test data — the imputer was fitted on train
# so no information leaks from the test set
X_test[cols_to_impute] = imp_mode.transform(X_test[cols_to_impute])
# Checking that no column has missing values in train or test sets
print(X_train.isna().sum())
print("-" * 30)
print(X_test.isna().sum())
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# Creating dummy variables for categorical variables
X_train = pd.get_dummies(data=X_train, drop_first=True)
X_val = pd.get_dummies(data=X_val, drop_first=True)
X_test = pd.get_dummies(data=X_test, drop_first=True)
# BUG FIX: get_dummies is applied to each split independently, so a category
# that is absent from the validation or test split would produce mismatched
# dummy columns and break prediction. Align both splits to the training
# columns; any dummy missing from a split is filled with 0.
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
# Storing the feature_names in a list for use in calculating feature importance later
feature_names = list(X_train.columns)
models = []  # Empty list to store all the models
# Appending models into the list
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("GBM", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("dtree", DecisionTreeClassifier(random_state=1)))
results = []  # Empty list to store all model's CV scores
names = []  # Empty list to store name of the models
score = []  # Empty list to store validation recall of each model
# loop through all models to get the mean cross validated score
# NOTE(review): StratifiedKFold and cross_val_score are not in the visible
# imports — confirm they are imported from sklearn.model_selection earlier.
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    scoring = "recall"  # recall is the metric of interest (minimize false negatives)
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
print("\n" "Validation Performance:" "\n")
for name, model in models:
    # fit on the full training set, then score recall on the held-out validation set
    model.fit(X_train, y_train)
    scores = recall_score(y_val, model.predict(X_val))
    score.append(scores)
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging: 95.56791548807944 Random forest: 97.88208354980854 GBM: 97.58792741826858 Adaboost: 96.3522484558102 Xgboost: 97.2152821874579 dtree: 92.54774961996576 Validation Performance: Bagging: 0.9647058823529412 Random forest: 0.9776470588235294 GBM: 0.9788235294117648 Adaboost: 0.9647058823529412 Xgboost: 0.9752941176470589 dtree: 0.9329411764705883
# Plotting boxplots for CV scores of all models defined above
# (one box per model, over its 5 fold scores)
fig = plt.figure()
fig.suptitle("Algorithm Comparison")
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
print("Before Oversampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Oversampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
# NOTE(review): SMOTE is not in the visible imports — presumably imported from
# imblearn earlier in the notebook. sampling_strategy=1 balances classes 1:1.
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After Oversampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After Oversampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After Oversampling, the shape of train_X: {}".format(X_train_over.shape))
print("After Oversampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before Oversampling, counts of label 'Yes': 5099 Before Oversampling, counts of label 'No': 976 After Oversampling, counts of label 'Yes': 5099 After Oversampling, counts of label 'No': 5099 After Oversampling, the shape of train_X: (10198, 36) After Oversampling, the shape of train_y: (10198,)
models_over = []  # Empty list to store all the models
# Appending models into the list (same six classifiers as the baseline run)
models_over.append(("Bagging", BaggingClassifier(random_state=1)))
models_over.append(("Random forest", RandomForestClassifier(random_state=1)))
models_over.append(("GBM", GradientBoostingClassifier(random_state=1)))
models_over.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models_over.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models_over.append(("dtree", DecisionTreeClassifier(random_state=1)))
results_over = []  # Empty list to store all model's CV scores
names_over = []  # Empty list to store name of the models
score_over = []  # Empty list to store validation recall of each model
# loop through all models to get the mean cross validated score
# on the SMOTE-oversampled training data
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models_over:
    scoring = "recall"
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
    )
    results_over.append(cv_result)
    names_over.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
print("\n" "Validation Performance:" "\n")
for name, model in models_over:
    # fit on the oversampled train set; validate on the untouched validation set
    model.fit(X_train_over, y_train_over)
    scores = recall_score(y_val, model.predict(X_val))
    score_over.append(scores)
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging: 93.86160980584579 Random forest: 96.88191036964344 GBM: 96.56812715272567 Adaboost: 94.50874559833747 Xgboost: 96.90144124381843 dtree: 91.1354076468664 Validation Performance: Bagging: 0.9464705882352941 Random forest: 0.9676470588235294 GBM: 0.9723529411764706 Adaboost: 0.9529411764705882 Xgboost: 0.9688235294117648 dtree: 0.9258823529411765
# Plotting boxplots for CV scores of all models defined above
# (SMOTE-oversampled training data)
fig = plt.figure()
fig.suptitle("Algorithm Comparison SMOTE")
ax = fig.add_subplot(111)
plt.boxplot(results_over)
ax.set_xticklabels(names_over)
plt.show()
# Random undersampling of the majority class to balance the training data.
# NOTE(review): RandomUnderSampler is not in the visible imports — presumably
# imported from imblearn earlier in the notebook.
rus = RandomUnderSampler(random_state=1)
X_train_under, y_train_under = rus.fit_resample(X_train, y_train)
print("Before Undersampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Undersampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
# BUG FIX: the prints below referenced undefined names X_train_un / y_train_un
# (NameError at runtime); they now use the X_train_under / y_train_under
# variables created by fit_resample above.
print("After Undersampling, counts of label 'Yes': {}".format(sum(y_train_under == 1)))
print("After Undersampling, counts of label 'No': {} \n".format(sum(y_train_under == 0)))
print("After Undersampling, the shape of train_X: {}".format(X_train_under.shape))
print("After Undersampling, the shape of train_y: {} \n".format(y_train_under.shape))
Before Undersampling, counts of label 'Yes': 5099 Before Undersampling, counts of label 'No': 976 After Undersampling, counts of label 'Yes': 976 After Undersampling, counts of label 'No': 976 After Undersampling, the shape of train_X: (1952, 36) After Undersampling, the shape of train_y: (1952,)
models_under = []  # Empty list to store all the models
# Appending models into the list (same six classifiers as the other runs)
models_under.append(("Bagging", BaggingClassifier(random_state=1)))
models_under.append(("Random forest", RandomForestClassifier(random_state=1)))
models_under.append(("GBM", GradientBoostingClassifier(random_state=1)))
models_under.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models_under.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models_under.append(("dtree", DecisionTreeClassifier(random_state=1)))
results_under = []  # Empty list to store all model's CV scores
names_under = []  # Empty list to store name of the models
score_under = []  # Empty list to store validation recall of each model
# loop through all models to get the mean cross validated score
# on the undersampled training data
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models_under:
    scoring = "recall"
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train_under, y=y_train_under, scoring=scoring, cv=kfold
    )
    results_under.append(cv_result)
    names_under.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
print("\n" "Validation Performance:" "\n")
for name, model in models_under:
    # fit on the undersampled train set; validate on the untouched validation set
    model.fit(X_train_under, y_train_under)
    scores = recall_score(y_val, model.predict(X_val))
    score_under.append(scores)
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging: 82.78440607012035 Random forest: 87.9084249084249 GBM: 88.0130821559393 Adaboost: 85.76085818942963 Xgboost: 87.70643642072213 dtree: 80.12140240711669 Validation Performance: Bagging: 0.8641176470588235 Random forest: 0.8947058823529411 GBM: 0.8935294117647059 Adaboost: 0.8811764705882353 Xgboost: 0.9064705882352941 dtree: 0.8476470588235294
# Plotting boxplots for CV scores of all models defined above
# (undersampled training data)
fig = plt.figure()
fig.suptitle("Algorithm Comparison UnderSampling")
ax = fig.add_subplot(111)
plt.boxplot(results_under)
ax.set_xticklabels(names_under)
plt.show()
# defining model: GBM whose initial estimator is an AdaBoost classifier
model = GradientBoostingClassifier(
    init=AdaBoostClassifier(random_state=1), random_state=1
)
# Grid of parameters to choose from
# BUG FIX: max_features=1 (an int) means "consider ONE feature per split" in
# sklearn; the other candidates are fractions, so the float 1.0 (= all
# features) was clearly intended.
parameters = {
    "n_estimators": [100, 150, 200, 250],
    "subsample": [0.8, 0.9, 1],
    "max_features": [0.7, 0.8, 0.9, 1.0],
}
# Type of scoring used to compare parameter combinations — recall, since
# false negatives (missed attriters) are the costly error
scorer = metrics.make_scorer(metrics.recall_score)
# Calling RandomizedSearchCV
# NOTE(review): RandomizedSearchCV is not in the visible imports — confirm it
# is imported from sklearn.model_selection earlier in the notebook. Also,
# n_iter=50 exceeds the 48 grid combinations, so the search enumerates the
# full grid.
randomized_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=parameters,
    n_jobs=-1,
    n_iter=50,
    scoring=scorer,
    cv=5,
    random_state=1,
)
# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train, y_train)
print(
    "Best parameters are {} with CV score={}:".format(
        randomized_cv.best_params_, randomized_cv.best_score_
    )
)
Best parameters are {'subsample': 0.9, 'n_estimators': 100, 'max_features': 1} with CV score=0.9898021897669764:
# building model with best parameters found by the randomized search
# BUG FIX: max_features=1 (an int) would let each split consider only ONE
# feature; the search grid was meant to hold fractions, so 1.0 (all features)
# is used. Note the previously printed "best" params were produced with the
# integer value — re-run the search to confirm.
gbm_tuned2 = GradientBoostingClassifier(
    n_estimators=100,
    subsample=0.9,
    max_features=1.0,
    random_state=1,
)
# Fit the model on training data
gbm_tuned2.fit(X_train, y_train)
GradientBoostingClassifier(max_features=1, random_state=1, subsample=0.9)
# Calculating different metrics on train set for the tuned GBM
gbm_random_train = model_performance_classification_sklearn(
    gbm_tuned2, X_train, y_train
)
print("Training performance:")
print(gbm_random_train)
print("*************************************")
# creating confusion matrix
confusion_matrix_sklearn(gbm_tuned2, X_train, y_train)
Training performance: Accuracy Recall Precision F1 0 0.902 0.995 0.899 0.944 *************************************
Hyperparameter tuning did help to improve the Recall score; the recall score for the training data is 0.995. Let's check the validation data to confirm that the model is not overfitting.
# defining model: random forest with bootstrap sampling (sklearn default)
model = RandomForestClassifier(random_state=1, bootstrap=True)
# Grid of parameters to choose from
parameters = {
    'max_depth': list(np.arange(5,30,5)) + [None],
    'max_features': ['sqrt','log2',None],
    'min_samples_leaf': np.arange(1,15,5),
    'min_samples_split': np.arange(2, 20, 5),
    'n_estimators': np.arange(10,110,10)}
# Type of scoring used to compare parameter combinations — recall, since
# false negatives are the costly error
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
# NOTE(review): RandomizedSearchCV is not in the visible imports — confirm it
# is imported from sklearn.model_selection earlier in the notebook.
randomized_cv = RandomizedSearchCV(estimator=model, param_distributions=parameters, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
#Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
Best parameters are {'n_estimators': 50, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 5} with CV score=0.9917629740807019:
# building model with best parameters found by the randomized search
# BUG FIX: random_state=1 was missing, so the rebuilt forest was not
# reproducible and could differ from the model evaluated by the search
# (which was constructed with random_state=1).
rf_tuned2 = RandomForestClassifier(
    bootstrap=True,
    n_estimators=50,
    min_samples_split=7,
    min_samples_leaf=1,
    max_features='log2',
    max_depth=5,
    random_state=1,
)
# Fit the model on training data
rf_tuned2.fit(X_train, y_train)
RandomForestClassifier(max_depth=5, max_features='log2', min_samples_split=7,
n_estimators=50)
# Calculating different metrics on train set for the tuned random forest
rf_random_train = model_performance_classification_sklearn(
    rf_tuned2, X_train, y_train
)
print("Training performance:")
print(rf_random_train)
print("*************************************")
# creating confusion matrix
confusion_matrix_sklearn(rf_tuned2, X_train, y_train)
Training performance: Accuracy Recall Precision F1 0 0.902 0.993 0.901 0.945 *************************************
%%time
# defining model
model = XGBClassifier(random_state=1,eval_metric='logloss')
# Parameter grid to pass in RandomizedSearchCV
# NOTE(review): tuning scale_pos_weight while scoring on recall alone can push
# the search toward models that label (nearly) everything positive — the
# reported CV recall of 1.0 with a weak configuration (max_depth=1,
# learning_rate=0.01) is consistent with that. Consider scoring on F1 or
# precision-constrained recall instead.
parameters={'n_estimators':np.arange(50,150,50),
    'scale_pos_weight':[2,5,10],
    'learning_rate':[0.01,0.1,0.2,0.05],
    'gamma':[0,1,3,5],
    'subsample':[0.8,0.9,1],
    'max_depth':np.arange(1,5,1),
    'reg_lambda':[5,10]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
xgb_tuned2 = RandomizedSearchCV(estimator=model, param_distributions=parameters, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
#Fitting parameters in RandomizedSearchCV
xgb_tuned2.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(xgb_tuned2.best_params_,xgb_tuned2.best_score_))
Best parameters are {'subsample': 0.9, 'scale_pos_weight': 10, 'reg_lambda': 5, 'n_estimators': 50, 'max_depth': 1, 'learning_rate': 0.01, 'gamma': 1} with CV score=1.0:
Wall time: 47.7 s
# building model with best parameters found by the randomized search
# NOTE(review): with scale_pos_weight=10 and a very shallow/slow configuration
# (max_depth=1, learning_rate=0.01), train precision equals the positive-class
# share (0.839) while recall is 1.0 — the model appears to predict nearly every
# customer as positive. Confirm whether this is acceptable before deployment.
xgb_tuned2 = XGBClassifier(
    random_state=1,
    n_estimators=50,
    scale_pos_weight=10,
    gamma=1,
    subsample=0.9,
    learning_rate=0.01,
    eval_metric="logloss",
    max_depth=1,
    reg_lambda=5,
)
# Fit the model on training data
xgb_tuned2.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
eval_metric='logloss', gamma=1, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.01, max_delta_step=0,
max_depth=1, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=50, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=1,
reg_alpha=0, reg_lambda=5, scale_pos_weight=10, subsample=0.9,
tree_method='exact', validate_parameters=1, verbosity=None)
# Calculating different metrics on train set for the tuned XGBoost
xgboost_random_train = model_performance_classification_sklearn(
    xgb_tuned2, X_train, y_train
)
print("Training performance:")
print(xgboost_random_train)
print("*************************************")
# creating confusion matrix
confusion_matrix_sklearn(xgb_tuned2, X_train, y_train)
Training performance: Accuracy Recall Precision F1 0 0.839 1.000 0.839 0.913 *************************************
Recall is 1 so the model is overfitting
# training performance comparison: one column per tuned model,
# one row per metric (each one-row metrics frame is transposed)
models_train_comp_df = pd.concat(
    [
        gbm_random_train.T,
        rf_random_train.T,
        xgboost_random_train.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "GBM Tuned with Random search",
    "Random Forest Tuned with Random search",
    "Xgboost Tuned with Random Search",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| GBM Tuned with Random search | Random Forest Tuned with Random search | Xgboost Tuned with Random Search | |
|---|---|---|---|
| Accuracy | 0.902 | 0.902 | 0.839 |
| Recall | 0.995 | 0.993 | 1.000 |
| Precision | 0.899 | 0.901 | 0.839 |
| F1 | 0.944 | 0.945 | 0.913 |
XGBoost tuned gave a recall score of 1, but it is overfitting. GBM Tuned had a good recall at 0.995, followed by Random Forest at 0.993. All the models might be overfitting; we can check with the validation data.
# Calculating different metrics on the validation set for the tuned GBM
gbm_random_val = model_performance_classification_sklearn(gbm_tuned2, X_val, y_val)
print("Validation performance:")
gbm_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.890 | 0.993 | 0.889 | 0.938 |
# Calculating different metrics on validation set for the tuned random forest
rf_random_val = model_performance_classification_sklearn(rf_tuned2, X_val, y_val)
print("Validation performance:")
rf_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.889 | 0.992 | 0.889 | 0.938 |
# Calculating different metrics on the validation set for the tuned XGBoost
xgb_random_val = model_performance_classification_sklearn(xgb_tuned2, X_val, y_val)
print("Validation performance:")
xgb_random_val
XGB performance is best with a Recall of 1 on the validation set, followed by GBM tuned with a Recall of 0.993 and Random Forest with 0.992. (Note that XGB's precision is close to the positive-class share, so its perfect recall should be interpreted with caution.)
# Feature-importance bar charts for the three tuned models. Identical to the
# three original copy-pasted cells: for each model the features are ordered
# ascending by importance and drawn as horizontal violet bars.
for fitted_model, plot_title in [
    (xgb_tuned2, "Feature Importances for XGB"),
    (gbm_tuned2, "Feature Importances for GBM"),
    (rf_tuned2, "Feature Importances for Random Forest"),
]:
    imp_values = fitted_model.feature_importances_
    order = np.argsort(imp_values)
    plt.figure(figsize=(12, 12))
    plt.title(plot_title)
    plt.barh(range(len(order)), imp_values[order], color="violet", align="center")
    plt.yticks(range(len(order)), [feature_names[i] for i in order])
    plt.xlabel("Relative Importance")
    plt.show()
The top features indicated across the three models are Total_Trans_Ct, Total_Ct_Chng_Q4_Q1, Avg_Utilization_Ratio, Total_Revolving_Bal, Total_Amt_Chng_Q4_Q1, Contacts_Count_12_mon, and Months_Inactive_12_mon.
# Calculating different metrics on the test set for the tuned XGBoost
xgb_random_test = model_performance_classification_sklearn(xgb_tuned2, X_test, y_test)
# BUG FIX: the label previously said "Validation performance:" although these
# metrics are computed on the test set
print("Test performance:")
xgb_random_test
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.840 | 1.000 | 0.840 | 0.913 |
The performance on test data was also very good, the Recall is 1 for test data too.
# creating a list of numerical variables
numerical_features = [
"Customer_Age",
"Months_Inactive_12_mon",
"Contacts_Count_12_mon",
"Credit_Limit",
"Total_Revolving_Bal",
"Avg_Open_To_Buy",
"Total_Amt_Chng_Q4_Q1",
"Total_Trans_Ct",
"Total_Ct_Chng_Q4_Q1",
"Avg_Utilization_Ratio",
]
# None of the numeric fields were null but we will still use the simpleImputer incase there are nulls int he future
# creating a transformer for numerical variables, which will apply simple imputer on the numerical variables
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])
# creating a list of categorical variables
categorical_features = [
"Gender",
"Dependent_count",
"Education_Level",
"Marital_Status",
"Income_Category",
"Card_Category",
"Total_Relationship_Count"
]
# creating a transformer for categorical variables, which will first apply simple imputer and
# then do one hot encoding for categorical variables
categorical_transformer = Pipeline(
steps=[
# fill any missing categories with the most frequent value (the mode)
("imputer", SimpleImputer(strategy="most_frequent")),
("onehot", OneHotEncoder(handle_unknown="ignore")),
]
)
# handle_unknown = "ignore" allows the model to handle any unknown category in the test data
# (unseen categories are encoded as all-zero columns instead of raising an error)
# combining categorical transformer and numerical transformer using a column transformer,
# so each column list gets its matching preprocessing pipeline
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numerical_features),
("cat", categorical_transformer, categorical_features),
],
remainder="passthrough",
)
# remainder = "passthrough" has been used; it allows variables that are present in the original data
# but not in numerical_features or categorical_features to pass through the column transformer unchanged
# Creating a copy of the raw data so the original DataFrame is left untouched while building the model
data1 = data.copy()
# Separating the target variable (Attrition_Flag) from the predictor variables
X = data1.drop(columns="Attrition_Flag")
Y = data1["Attrition_Flag"]
# Splitting the data into train and test sets (70/30)
# stratify=Y keeps the attrition class ratio identical in both splits;
# random_state=1 makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
(7088, 17) (3039, 17)
# Creating the final pipeline: preprocessing followed by an XGBoost classifier
# configured with the best hyperparameters found during tuning
model = Pipeline(
steps=[
("pre", preprocessor),
(
"XGB",
XGBClassifier(
random_state=1,
n_estimators=50,
# scale_pos_weight=10 up-weights the positive (attrited) class,
# presumably to counter class imbalance — confirm against the class ratio
scale_pos_weight=10,
gamma=1,
subsample=0.9,
learning_rate=0.01,
eval_metric="logloss",
max_depth=1,
reg_lambda=5,
),
),
]
)
# Fit the full pipeline (preprocessing + classifier) on the training data
model.fit(X_train, y_train)
Pipeline(steps=[('pre',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median'))]),
['Customer_Age',
'Months_Inactive_12_mon',
'Contacts_Count_12_mon',
'Credit_Limit',
'Total_Revolving_Bal',
'Avg_Open_To_Buy',
'Total_Amt_Chng_Q4_Q1',
'Total_Trans_Ct',
'Total_Ct_Chng_Q4_Q1',
'Avg_Utilization_Ra...
eval_metric='logloss', gamma=1, gpu_id=-1,
importance_type=None, interaction_constraints='',
learning_rate=0.01, max_delta_step=0,
max_depth=1, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=50,
n_jobs=8, num_parallel_tree=1, predictor='auto',
random_state=1, reg_alpha=0, reg_lambda=5,
scale_pos_weight=10, subsample=0.9,
tree_method='exact', validate_parameters=1,
verbosity=None))])
# Transforming and predicting on test data — the pipeline applies the fitted
# preprocessor to X_test and then runs the XGB classifier's predict
model.predict(X_test)
array([1, 1, 1, ..., 1, 1, 1], dtype=int64)